This is a project on conducting Cohort and Customer Churn Analysis on online retail sales data. Online Sales data was collected between 01/12/2010 and 09/12/2011 for a UK-based online retail store. We will examine the retention rate of customers over a 13 month period. First we remove the N/A’s from our data set and then use the “cohorts” package in R to create a cohort table. Then we will use a line graph and tile chart to visualize the 13 cohorts over the months.
Original dataset: https://www.kaggle.com/datasets/ersany/online-retail-dataset
install.packages('tidyverse', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpeCCFDj\downloaded_packages
install.packages('readxl', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readxl'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\Steve\AppData\Local\R\win-library\4.2\00LOCK\readxl\libs\x64\readxl.dll
## to C:\Users\Steve\AppData\Local\R\win-library\4.2\readxl\libs\x64\readxl.dll:
## Permission denied
## Warning: restored 'readxl'
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpeCCFDj\downloaded_packages
install.packages('cohorts', repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Steve/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'cohorts' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Steve\AppData\Local\Temp\RtmpeCCFDj\downloaded_packages
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readxl)
library(cohorts)
df <- read_xlsx('Online Retail.xlsx')
str(df)
## tibble [541,909 × 8] (S3: tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
summary(df)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00.00 Min. :-11062.06 Min. :12346
## 1st Qu.:2011-03-28 11:34:00.00 1st Qu.: 1.25 1st Qu.:13953
## Median :2011-07-19 17:17:00.00 Median : 2.08 Median :15152
## Mean :2011-07-04 13:34:57.16 Mean : 4.61 Mean :15288
## 3rd Qu.:2011-10-19 11:27:00.00 3rd Qu.: 4.13 3rd Qu.:16791
## Max. :2011-12-09 12:50:00.00 Max. : 38970.00 Max. :18287
## NA's :135080
## Country
## Length:541909
## Class :character
## Mode :character
##
##
##
##
df1 <- na.omit(df)
df1$YMD <- as.Date(df1$InvoiceDate)
cohort_data <- df1 %>% select(CustomerID, YMD)
cohort_data %>% cohort_table_month(CustomerID, YMD)
## # A tibble: 13 × 14
## cohort `Dec 2010` `Jan 2011` Feb 20…¹ Mar 2…² Apr 2…³ May 2…⁴ Jun 2…⁵ Jul 2…⁶
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 948 362 317 367 341 376 360 336
## 2 2 NA 421 101 119 102 138 126 110
## 3 3 NA NA 380 94 73 106 102 94
## 4 4 NA NA NA 440 84 112 96 102
## 5 5 NA NA NA NA 299 68 66 63
## 6 6 NA NA NA NA NA 279 66 48
## 7 7 NA NA NA NA NA NA 235 49
## 8 8 NA NA NA NA NA NA NA 191
## 9 9 NA NA NA NA NA NA NA NA
## 10 10 NA NA NA NA NA NA NA NA
## 11 11 NA NA NA NA NA NA NA NA
## 12 12 NA NA NA NA NA NA NA NA
## 13 13 NA NA NA NA NA NA NA NA
## # … with 5 more variables: `Aug 2011` <int>, `Sep 2011` <int>,
## # `Oct 2011` <int>, `Nov 2011` <int>, `Dec 2011` <int>, and abbreviated
## # variable names ¹`Feb 2011`, ²`Mar 2011`, ³`Apr 2011`, ⁴`May 2011`,
## # ⁵`Jun 2011`, ⁶`Jul 2011`
cohort_data %>% cohort_table_month(CustomerID, YMD) %>% cohort_table_pct()
## # A tibble: 13 × 14
## cohort `Dec 2010` `Jan 2011` Feb 20…¹ Mar 2…² Apr 2…³ May 2…⁴ Jun 2…⁵ Jul 2…⁶
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 100 38.2 33.4 38.7 36 39.7 38 35.4
## 2 2 NA 100 24 28.3 24.2 32.8 29.9 26.1
## 3 3 NA NA 100 24.7 19.2 27.9 26.8 24.7
## 4 4 NA NA NA 100 19.1 25.5 21.8 23.2
## 5 5 NA NA NA NA 100 22.7 22.1 21.1
## 6 6 NA NA NA NA NA 100 23.7 17.2
## 7 7 NA NA NA NA NA NA 100 20.9
## 8 8 NA NA NA NA NA NA NA 100
## 9 9 NA NA NA NA NA NA NA NA
## 10 10 NA NA NA NA NA NA NA NA
## 11 11 NA NA NA NA NA NA NA NA
## 12 12 NA NA NA NA NA NA NA NA
## 13 13 NA NA NA NA NA NA NA NA
## # … with 5 more variables: `Aug 2011` <dbl>, `Sep 2011` <dbl>,
## # `Oct 2011` <dbl>, `Nov 2011` <dbl>, `Dec 2011` <dbl>, and abbreviated
## # variable names ¹`Feb 2011`, ²`Mar 2011`, ³`Apr 2011`, ⁴`May 2011`,
## # ⁵`Jun 2011`, ⁶`Jul 2011`
cohort_data %>% cohort_table_month(CustomerID, YMD) %>% cohort_table_pct() %>% shift_left()
## # A tibble: 13 × 14
## cohort t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 100 38.2 33.4 38.7 36 39.7 38 35.4 35.4 39.5 37.3
## 2 2 100 24 28.3 24.2 32.8 29.9 26.1 25.7 31.1 34.7 36.8
## 3 3 100 24.7 19.2 27.9 26.8 24.7 25.5 28.2 25.8 31.3 9.2
## 4 4 100 19.1 25.5 21.8 23.2 17.7 26.4 23.9 28.9 8.9 0
## 5 5 100 22.7 22.1 21.1 20.7 23.7 23.1 26.1 8.4 0 0
## 6 6 100 23.7 17.2 17.2 21.5 24.4 26.5 10.4 0 0 0
## 7 7 100 20.9 18.7 27.2 24.7 33.6 10.2 0 0 0 0
## 8 8 100 20.9 20.4 23 27.2 11.5 0 0 0 0 0
## 9 9 100 25.1 25.1 25.1 13.8 0 0 0 0 0 0
## 10 10 100 29.9 32.6 12.1 0 0 0 0 0 0 0
## 11 11 100 26.4 13.1 0 0 0 0 0 0 0 0
## 12 12 100 13.4 0 0 0 0 0 0 0 0 0
## 13 13 100 0 0 0 0 0 0 0 0 0 0
## # … with 2 more variables: t11 <dbl>, t12 <dbl>
cohort_data %>% cohort_table_month(CustomerID, YMD) %>% shift_left_pct()
## # A tibble: 13 × 14
## cohort t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 100 38.2 33.4 38.7 36 39.7 38 35.4 35.4 39.5 37.3
## 2 2 100 24 28.3 24.2 32.8 29.9 26.1 25.7 31.1 34.7 36.8
## 3 3 100 24.7 19.2 27.9 26.8 24.7 25.5 28.2 25.8 31.3 9.2
## 4 4 100 19.1 25.5 21.8 23.2 17.7 26.4 23.9 28.9 8.9 0
## 5 5 100 22.7 22.1 21.1 20.7 23.7 23.1 26.1 8.4 0 0
## 6 6 100 23.7 17.2 17.2 21.5 24.4 26.5 10.4 0 0 0
## 7 7 100 20.9 18.7 27.2 24.7 33.6 10.2 0 0 0 0
## 8 8 100 20.9 20.4 23 27.2 11.5 0 0 0 0 0
## 9 9 100 25.1 25.1 25.1 13.8 0 0 0 0 0 0
## 10 10 100 29.9 32.6 12.1 0 0 0 0 0 0 0
## 11 11 100 26.4 13.1 0 0 0 0 0 0 0 0
## 12 12 100 13.4 0 0 0 0 0 0 0 0 0
## 13 13 100 0 0 0 0 0 0 0 0 0 0
## # … with 2 more variables: t11 <dbl>, t12 <dbl>
cohort_data_long <- cohort_data %>% cohort_table_month(CustomerID, YMD) %>%
shift_left_pct() %>% pivot_longer(-cohort) %>% mutate(time=as.numeric(str_remove(name,"t")))
cohort_data_long %>%
filter(value > 0,cohort <=13, time>=0) %>%
ggplot(aes(time,value,colour=factor(cohort), group=cohort)) +
geom_line(size = 1.5) +
geom_point(size = 1.5) +
theme_light() + labs(y="Retention Rate (%)", x="Month") +
ggtitle("Line Graph of Retention Rates") + theme(plot.title = element_text(hjust = 0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
cohort_data_long %>%
filter(time >= 0, value > 0) %>%
ggplot(aes(time, reorder(cohort, desc(cohort)))) +
geom_raster(aes(fill = log(value))) +
coord_equal(ratio = 1) +
geom_text(aes(label = glue::glue("{round(value,0)}%")),
size = 3,
colour = "snow") +
scale_fill_gradient(guide = "none") +
theme_minimal(base_size = 16) +
theme(panel.grid = element_blank(),
panel.border = element_blank()) +
labs(y = "Cohort",x="Month") +
ggtitle("Cohort Tile Chart for Retention Rate") + theme(plot.title = element_text(hjust = 0.5))